# NOTE: a __future__ import must be the very first statement in the file;
# it previously sat after the other imports, which is a SyntaxError.
# The useless "as pr" alias is dropped (the name was never used).
from __future__ import print_function

import datetime as dt

import numpy as np
import pandas as pd
import plotly as ply
from matplotlib import pyplot as plt
# pandas.tools was removed from pandas; boxplot now lives in pandas.plotting.
from pandas.plotting import boxplot
from sklearn import metrics as mt
from sklearn.linear_model import LogisticRegression as lr
from sklearn.model_selection import ShuffleSplit as ss
from sklearn.pipeline import Pipeline as pl
from sklearn.preprocessing import StandardScaler as sts
from sklearn.svm import SVC

## working object will be read later on
# rainfall_original = pd.read_csv('weatherAus.csv')
#
# The functions below impute each NaN with the average of the bracketing
# non-NaN values, using only observations from the same city:
# - if the NaN is the earliest timepoint for a city, the next non-NaN value
#   is used instead of the mean
# - if the NaN is the latest timepoint for a city, the previous non-NaN value
#   is used instead of the mean
def impute_by_city(cities, variables):
    """Impute NaNs in the module-level `rainfall` DataFrame, city by city.

    For every city in `cities` and every column name in `variables`, each
    NaN is replaced by the value computed by find_mean (the mean of the
    bracketing non-NaN values, with edge-case fallbacks).  Columns that are
    entirely NaN for a city are skipped.  Mutates `rainfall` in place and
    returns None.
    """
    for city in cities:
        # Observations for a single city; .copy() so our interim writes do
        # not alias a view of `rainfall` (we write `rainfall` explicitly).
        temp = rainfall[rainfall.Location == city].copy()
        for var in variables:
            # Hoisted: the original re-evaluated this all-NaN test on every
            # row even though imputation can never change its outcome.
            if pd.isna(temp[var]).all():
                continue  # nothing to bracket with; leave the column alone
            # Iterate the actual index labels (assumed contiguous per city
            # in file order -- TODO confirm against the source CSV).
            for i in temp.index:
                if pd.isna(temp[var][i]):
                    value = find_mean(temp[var], i)
                    # .loc instead of chained indexing (temp[var][i] = ...)
                    # so the assignment actually lands on the frame.
                    temp.loc[i, var] = value
                    rainfall.loc[i, var] = value
def find_mean(templist, index):
    """Return the imputed value for the NaN at templist[index].

    Normally the mean of the nearest non-NaN neighbours below and above;
    at the first/last index (or when only NaNs remain above) the single
    available neighbour's value is returned instead.
    """
    first, last = min(templist.index), max(templist.index)
    if index == first:
        # Earliest timepoint: nothing below, take the next non-NaN value.
        return find_top(templist, index)
    if index == last:
        # Latest timepoint: nothing above, take the previous non-NaN value.
        return find_bottom(templist, index)
    below = find_bottom(templist, index)  # previous non-NaN value
    above = find_top(templist, index)     # next non-NaN value (NaN if none)
    # find_top signals "only NaNs remain above" with NaN; fall back to below.
    return below if pd.isna(above) else (above + below) / 2
#find previous non-NaN value
def find_bottom(templist, index):
    """Return the nearest non-NaN value strictly below `index` in templist.

    The caller guarantees that such a value exists (i.e. `index` is not the
    earliest timepoint); otherwise the walk would run off the front.
    """
    pos = index
    # Walk downwards until the entry just below `pos` is a real value.
    while pd.isna(templist[pos - 1]):
        pos -= 1
    return templist[pos - 1]
#find next non-NaN value
#if there are no more non-NaN values above, np.nan is returned (the caller then falls back to the previous non-NaN value)
def find_top(templist, index):
    """Return the nearest non-NaN value strictly above `index` in templist.

    Returns np.nan when the walk reaches the last index while still seeing
    NaNs, i.e. no non-NaN value exists above `index`.
    """
    last = max(templist.index)
    pos = index
    while pd.isna(templist[pos + 1]):
        pos += 1
        if pos == last:
            # Ran out of entries: nothing but NaNs above `index`.
            return np.nan
    return templist[pos + 1]
##can be skipped if rainfall.csv already generated!
#rainfall = rainfall_original.copy()
#rainfall.drop(["RISK_MM"], axis=1, inplace=True) #RISK_MM was used to extrapolate response variable.
#rainfall['Date'] = pd.to_datetime(rainfall['Date'], format='%Y-%m-%d') #change 'Date' variable to datetime64
#rainfall.dropna(subset=["RainToday"], inplace=True) #drop any observation with no record of rainfall for the day,
# cannot be imputed
#rainfall = rainfall.reset_index(drop=True) #reset the index after drops
#rainfall.info()
##can be skipped if rainfall.csv already generated!
##set the cardinal directions to degrees
#directions = {'N':0, 'NNE':22.5, 'NE':45, 'ENE':67.5, 'E':90, 'ESE':112.5, 'SE':135, 'SSE':157.5, 'S':180,\
#              'SSW':202.5, 'SW':225, 'WSW':247.5, 'W':270, 'WNW':292.5, 'NW':315, 'NNW':337.5}
#cities = rainfall.Location.unique() #get name of all cities in the data frame
#c_variables = [] #variables with continuous values
#d_variables = [] #variables with discrete values
#rainfall = rainfall.replace(directions) #replace cardinal directions with their corresponding degrees
##change 'Yes' and 'No' to 1 and 0 respectively
#rainfall.RainToday = rainfall.RainToday=='Yes'
#rainfall.RainToday = rainfall.RainToday.astype(int)  #np.int was removed from NumPy; use the builtin int
#rainfall.RainTomorrow = rainfall.RainTomorrow=='Yes'
#rainfall.RainTomorrow = rainfall.RainTomorrow.astype(int)
#for l in list(rainfall):
# if (rainfall[l].dtypes == 'float64'):
# c_variables.append(l)
# else:
# d_variables.append(l)
##can be skipped if rainfall.csv already generated!
##very expensive, rainfall.csv can be uploaded from work directory
#impute_by_city(cities, c_variables) #impute values to NaN
#rainfall.to_csv("rainfall.csv", sep=',', index=True) #save to csv for later use
#load pre-generated rainfall.csv file
# Load the pre-generated rainfall.csv (first column is the saved index).
rainfall = pd.read_csv('rainfall.csv', index_col=0)
rainfall.info()
# Evaporation and Sunshine are dropped because they contain too many NaNs.
rainfall = rainfall.drop(['Evaporation', 'Sunshine'], axis=1)
cities_before = list(rainfall.Location.unique())
# Drop every remaining row with any NaN -- this removes whole cities whose
# stations never recorded a given variable.
rainfall.dropna(subset=list(rainfall), inplace=True)
# Report which cities disappeared entirely in the dropna above.
remaining_cities = rainfall.Location.unique()
for city in cities_before:
    if city not in remaining_cities:
        print(city)
# Date and Location are identifiers, not predictors.
rainfall = rainfall.drop(['Date', 'Location'], axis=1)
# Logistic-regression working copy: separate the label from the features.
lr_rainfall = rainfall.copy()
if 'RainTomorrow' in lr_rainfall:
    y = lr_rainfall['RainTomorrow'].values  # the 0/1 labels we want to predict
    del lr_rainfall['RainTomorrow']         # drop the class label column
    x = lr_rainfall.values                  # everything else is a predictor

# Cross-validation setup: 5 random 80/20 train/test splits.
num_cv_iterations = 5
num_instances = len(y)
cv_object = ss(n_splits=num_cv_iterations, test_size=0.2)
print(cv_object)

# Bookkeeping for the per-fold coefficient vectors.
column_names = lr_rainfall.columns
weights = []
weights_array = []
scl_obj = sts()  # standardiser, fit on each training split below
# Cross-validation: fit and score a logistic regression on each split.
for train_indices, test_indices in cv_object.split(x, y):
    # Standardise with statistics from the training split only
    # (no snooping at the test set values).
    scl_obj.fit(x[train_indices])
    X_train_scaled = scl_obj.transform(x[train_indices])
    X_test_scaled = scl_obj.transform(x[test_indices])

    # L2-regularised logistic regression; the small C means a stronger penalty.
    lr_clf = lr(penalty='l2', C=0.05)
    lr_clf.fit(X_train_scaled, y[train_indices])

    # Evaluate on the held-out split.
    y_hat = lr_clf.predict(X_test_scaled)
    acc = mt.accuracy_score(y[test_indices], y_hat)
    conf = mt.confusion_matrix(y[test_indices], y_hat)
    print("")
    print('accuracy:', acc)
    print(conf)

    # Pair each coefficient with its column name, report it, and collect
    # this fold's coefficient vector.
    fold_weights = []
    coef_by_column = pd.Series(lr_clf.coef_[0], index=column_names)
    for name, coef in coef_by_column.items():
        print(name, 'has weight of', coef)
        fold_weights.append(coef)
    weights_array.append(fold_weights)
weights_array = np.array(weights_array)
# Summarise the per-fold coefficients and plot mean +/- std as a bar chart.
ply.offline.init_notebook_mode()  # run at the start of every notebook
final_array = pd.DataFrame(
    data={'mean': weights_array.mean(axis=0), 'std': weights_array.std(axis=0)},
    index=column_names,
).sort_values(by=['mean'])
error_y = {
    'type': 'data',
    'array': final_array['std'].values,  # std across folds as error bars
    'visible': True,
}
graph1 = {
    'x': final_array.index,
    'y': final_array['mean'].values,
    'error_y': error_y,
    'type': 'bar',
}
fig = {
    'data': [graph1],
    'layout': {'title': 'Logistic Regression Weights, with error bars'},
}
ply.offline.iplot(fig)
# Grab variables whose mean coefficient has absolute value greater than a
# user-determined cutoff.  Vectorised replacement for the original
# iterrows() loop; same strict inequality (|mean| > cutoff) and, because a
# boolean mask preserves row order, the same ordering as before.
cutoff = 0.5
lr_voi = list(final_array.index[final_array['mean'].abs() > cutoff])

lr_rainfall['RainTomorrow'] = y  # add the label back in for the original data

# now lets see the statistics of these attributes
df_grouped = lr_rainfall.groupby(['RainTomorrow'])

# Plot a class-conditional KDE of each selected variable.
vars_to_plot = lr_voi
for v in vars_to_plot:
    plt.figure(figsize=(10, 4))
    # Original distributions go in the right-hand panel (the left panel is
    # used for support vectors in the SVM section further down).
    plt.subplot(1, 2, 2)
    ax = df_grouped[v].plot.kde()
    plt.legend(['no rain', 'rained'])
    plt.title(v + ' (Original)')
# SVM working copy: separate the label from the features.
svm_rainfall = rainfall.copy()
# we want to predict the X and y data as follows:
# BUG FIX: this guard previously checked lr_rainfall (copy-paste from the
# logistic-regression section); it must check the frame used here.
if 'RainTomorrow' in svm_rainfall:
    y = svm_rainfall['RainTomorrow'].values  # get the labels we want
    del svm_rainfall['RainTomorrow']         # get rid of the class label
    x = svm_rainfall.values                  # use everything else to predict!

# split our data into training and testing splits
num_cv_iterations = 5
num_instances = len(y)
cv_object = ss(n_splits=num_cv_iterations, test_size=0.2)
print(cv_object)

# Bookkeeping for the per-fold coefficient vectors.
weights = []
weights_array = []
# okay, so run through the cross validation loop and set the training and
# testing variables for each iteration
for train_indices, test_indices in cv_object.split(x, y):
    # New variables so that it is more obvious what the code is doing
    # (this duplicates memory but keeps the code readable).
    X_train = x[train_indices]
    y_train = y[train_indices]
    X_test = x[test_indices]
    y_test = y[test_indices]

    # BUG FIX: refit the scaler on THIS fold's training data.  Previously
    # the transforms reused whatever fit scl_obj was left with from the
    # logistic-regression loop -- a different random split entirely.
    scl_obj.fit(X_train)
    X_train_scaled = scl_obj.transform(X_train)
    X_test_scaled = scl_obj.transform(X_test)

    # train the model just as before (linear kernel, so coef_ is defined)
    svm_clf = SVC(C=0.5, kernel='linear', degree=3, gamma='auto')
    svm_clf.fit(X_train_scaled, y_train)

    # get test set predictions and score them
    y_hat = svm_clf.predict(X_test_scaled)
    acc = mt.accuracy_score(y_test, y_hat)
    conf = mt.confusion_matrix(y_test, y_hat)
    print("")
    print('accuracy:', acc)
    print(conf)

    # Pair each linear-kernel coefficient with its column name, report it,
    # and collect this fold's coefficient vector.
    fold_weights = []
    coef_by_column = pd.Series(svm_clf.coef_[0], index=column_names)
    for name, coef in coef_by_column.items():
        print(name, 'has weight of', coef)
        fold_weights.append(coef)
    weights_array.append(fold_weights)
weights_array = np.array(weights_array)

# look at the support vectors of the last fold's model
print(svm_clf.support_vectors_.shape)
print(svm_clf.support_.shape)
print(svm_clf.n_support_)
# Summarise the per-fold SVM coefficients and plot mean +/- std as a bar chart.
ply.offline.init_notebook_mode()  # run at the start of every notebook
final_array = pd.DataFrame(
    data={'mean': weights_array.mean(axis=0), 'std': weights_array.std(axis=0)},
    index=column_names,
).sort_values(by=['mean'])
error_y = {
    'type': 'data',
    'array': final_array['std'].values,  # std across folds as error bars
    'visible': True,
}
graph1 = {
    'x': final_array.index,
    'y': final_array['mean'].values,
    'error_y': error_y,
    'type': 'bar',
}
fig = {
    'data': [graph1],
    'layout': {'title': 'Support Vector Machines Weights, with error bars'},
}
ply.offline.iplot(fig)
# Grab variables whose mean coefficient has absolute value greater than a
# user-determined cutoff.  Vectorised replacement for the original
# iterrows() loop; same strict inequality and row order as before.
cutoff = 0.5
svm_voi = list(final_array.index[final_array['mean'].abs() > cutoff])
# make a dataframe of the training data (indices chosen for training in the
# last cross-validation fold, saved from the loop above)
df_tested_on = svm_rainfall.iloc[train_indices]
# now get the support vectors from the trained model; .copy() so the column
# assignment below does not write into a view of svm_rainfall
df_support = df_tested_on.iloc[svm_clf.support_, :].copy()
# BUG FIX: svm_clf.support_ holds indices into the TRAINING subset the model
# was fit on, not into the full data set -- so the labels must come from
# y[train_indices], not y.  (Comment also fixed: this restores the
# 'RainTomorrow' column, not Titanic's 'Survived'.)
df_support['RainTomorrow'] = y[train_indices][svm_clf.support_]
svm_rainfall['RainTomorrow'] = y  # also add the label back for the original data
df_support.info()

# group the original data and the support vectors by class
df_grouped_support = df_support.groupby(['RainTomorrow'])
df_grouped = svm_rainfall.groupby(['RainTomorrow'])
# Class-conditional KDE of each selected variable:
# support vectors (left panel) vs the full data (right panel).
vars_to_plot = svm_voi
for v in vars_to_plot:
    plt.figure(figsize=(10, 4))

    # distribution over the instances chosen as support vectors
    plt.subplot(1, 2, 1)
    ax = df_grouped_support[v].plot.kde()
    plt.legend(['no rain', 'rained'])
    plt.title(v + ' (Instances chosen as Support Vectors)')

    # distribution over the original data
    plt.subplot(1, 2, 2)
    ax = df_grouped[v].plot.kde()
    plt.legend(['no rain', 'rained'])
    plt.title(v + ' (Original)')